import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Load the German Credit dataset from the current working directory.
german_credit = pd.read_csv('GermanCredit.csv')
# Quick preview of the first rows to confirm the file parsed as expected.
german_credit.head()
| Duration | Amount | InstallmentRatePercentage | ResidenceDuration | Age | NumberExistingCredits | NumberPeopleMaintenance | Telephone | ForeignWorker | Class | ... | OtherInstallmentPlans.Bank | OtherInstallmentPlans.Stores | OtherInstallmentPlans.None | Housing.Rent | Housing.Own | Housing.ForFree | Job.UnemployedUnskilled | Job.UnskilledResident | Job.SkilledEmployee | Job.Management.SelfEmp.HighlyQualified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 1169 | 4 | 4 | 67 | 2 | 1 | 0 | 1 | Good | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 1 | 48 | 5951 | 2 | 2 | 22 | 1 | 1 | 1 | 1 | Bad | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 12 | 2096 | 2 | 3 | 49 | 1 | 2 | 1 | 1 | Good | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 42 | 7882 | 2 | 4 | 45 | 1 | 2 | 1 | 1 | Good | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 4 | 24 | 4870 | 3 | 4 | 53 | 2 | 2 | 1 | 1 | Bad | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
5 rows × 62 columns
# Keep the full (pre-drop) column list for later reference.
colNames = german_credit.columns
# Reclassify 'Class' variable: 'Good' -> 1, 'Bad' -> 0.
# A single .map() replaces the chained .replace().replace(), does the
# encoding in one pass, and avoids pandas' deprecated implicit downcasting
# behaviour in Series.replace.
german_credit['Class'] = german_credit['Class'].map({'Good': 1, 'Bad': 0})
german_credit.head(3)
| Duration | Amount | InstallmentRatePercentage | ResidenceDuration | Age | NumberExistingCredits | NumberPeopleMaintenance | Telephone | ForeignWorker | Class | ... | OtherInstallmentPlans.Bank | OtherInstallmentPlans.Stores | OtherInstallmentPlans.None | Housing.Rent | Housing.Own | Housing.ForFree | Job.UnemployedUnskilled | Job.UnskilledResident | Job.SkilledEmployee | Job.Management.SelfEmp.HighlyQualified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 1169 | 4 | 4 | 67 | 2 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 1 | 48 | 5951 | 2 | 2 | 22 | 1 | 1 | 1 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 12 | 2096 | 2 | 3 | 49 | 1 | 2 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
3 rows × 62 columns
# Predictors are every column except the encoded target.
X = german_credit.drop('Class', axis=1)
y = german_credit['Class']
from sklearn.model_selection import train_test_split
# Hold out 30% of rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)
# Depth-2 tree: a deliberately shallow, interpretable baseline.
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=2)
# Predict class labels (1 = Good, 0 = Bad per the earlier encoding) for the held-out rows.
y_pred = clf.predict(X_test)
y_pred
array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1], dtype=int64)
from sklearn import metrics
# Fraction of held-out rows the depth-2 tree labels correctly.
Accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy
0.7
# Class labels in report order: 0 = Bad, 1 = Good.
target = [0, 1]
# Pass `target` as `labels=` — in the original it was assigned but never
# used, so the variable had no effect on the report.
print(classification_report(y_test, y_pred, labels=target))
precision recall f1-score support
0 0.46 0.36 0.41 85
1 0.77 0.83 0.80 215
accuracy 0.70 300
macro avg 0.62 0.60 0.60 300
weighted avg 0.68 0.70 0.69 300
# Use grid search to find the best hyperparameters, then check accuracy
# and the classification report with the tuned model.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [90, 100, 110],
    'min_samples_split': [8, 10, 12],
}
clf_2 = DecisionTreeClassifier()
# 10-fold cross-validation over all 18 parameter combinations.
tree_cv = GridSearchCV(clf_2, param_grid, cv=10, n_jobs=5, verbose=10)
tree_cv.fit(X_train, y_train)
print("tuned hyperparameters :(best parameters) ", tree_cv.best_params_)
print("accuracy :", tree_cv.best_score_)
Fitting 10 folds for each of 18 candidates, totalling 180 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers. [Parallel(n_jobs=5)]: Done 3 tasks | elapsed: 1.9s [Parallel(n_jobs=5)]: Done 8 tasks | elapsed: 2.0s [Parallel(n_jobs=5)]: Done 15 tasks | elapsed: 2.0s [Parallel(n_jobs=5)]: Done 22 tasks | elapsed: 2.0s [Parallel(n_jobs=5)]: Batch computation too fast (0.1641s.) Setting batch_size=2. [Parallel(n_jobs=5)]: Done 31 tasks | elapsed: 2.0s [Parallel(n_jobs=5)]: Batch computation too fast (0.0247s.) Setting batch_size=4. [Parallel(n_jobs=5)]: Done 45 tasks | elapsed: 2.0s [Parallel(n_jobs=5)]: Batch computation too fast (0.0440s.) Setting batch_size=8. [Parallel(n_jobs=5)]: Done 79 tasks | elapsed: 2.1s [Parallel(n_jobs=5)]: Batch computation too fast (0.1080s.) Setting batch_size=16. [Parallel(n_jobs=5)]: Done 152 out of 180 | elapsed: 2.3s remaining: 0.3s [Parallel(n_jobs=5)]: Done 180 out of 180 | elapsed: 2.3s finished
tuned hyperparameters :(best parameters) {'criterion': 'gini', 'max_depth': 90, 'min_samples_split': 12}
accuracy : 0.7285714285714286
# BUG FIX: the original called clf_2.fit(X_train, y_train), which refit the
# tree with DEFAULT hyperparameters and silently threw away the grid-search
# result (which is why test accuracy dropped below the untuned baseline).
# GridSearchCV (refit=True by default) already refit the best parameter
# combination on the whole training set — use that estimator.
clf_2 = tree_cv.best_estimator_
y_pred = clf_2.predict(X_test)
y_pred
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0], dtype=int64)
# Test-set accuracy of the second tree model.
Accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy
0.66
# Class labels in report order: 0 = Bad, 1 = Good.
target = [0, 1]
# Pass `target` as `labels=` — in the original it was assigned but never
# used, so the variable had no effect on the report.
print(classification_report(y_test, y_pred, labels=target))
precision recall f1-score support
0 0.40 0.42 0.41 85
1 0.77 0.75 0.76 215
accuracy 0.66 300
macro avg 0.59 0.59 0.59 300
weighted avg 0.66 0.66 0.66 300
#! pip install dtreeviz
import sys
import os
# add library module to PYTHONPATH
sys.path.append(f"{os.getcwd()}/../")
# NOTE(review): this wildcard import appears unused below — consider removing.
from sklearn.datasets import *
# Import only the name actually used instead of `from dtreeviz.trees import *`:
# wildcard imports pollute the namespace and can silently shadow earlier names.
from dtreeviz.trees import dtreeviz
from IPython.display import Image, display_svg, SVG
#import os
#os.environ["PATH"] += os.pathsep + r'C:\Users\deepa\anaconda3\pkgs\graphviz-2.38-hfd603c8_2\Library\bin'
# Visualize the decision tree. The target is the binary Good/Bad class,
# so this is a CLASSIFICATION tree — the original title said "regression".
viz = dtreeviz(
    clf_2,
    X_train,
    y_train,
    target_name='Class',          # this name will be displayed at the leaf node
    feature_names=X.columns,
    title="German Credit data set classification",
    fontname="Arial",
    title_fontsize=16,
    colors={"title": "purple"},
)
viz
# Random Forests
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Small 10-tree forest; fixed seed keeps results reproducible.
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=10, random_state=0)
# Predict test-set labels with the random forest.
y_pred = clf.predict(X_test)
y_pred
array([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0], dtype=int64)
# Class labels in report order: 0 = Bad, 1 = Good.
target = [0, 1]
# Pass `target` as `labels=` — in the original it was assigned but never
# used, so the variable had no effect on the report.
print(classification_report(y_test, y_pred, labels=target))
precision recall f1-score support
0 0.51 0.47 0.49 85
1 0.80 0.82 0.81 215
accuracy 0.72 300
macro avg 0.66 0.65 0.65 300
weighted avg 0.72 0.72 0.72 300
plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)
clf.fit(X_train, y_train)
# BUG FIX: the original scored on the FULL dataset (clf.score(X, y)), which
# includes the training rows and therefore reports an inflated accuracy
# (0.911 vs ~0.72 on held-out data). Score on the test split only.
Accuracy = clf.score(X_test, y_test)
print('Accuracy:', Accuracy, '\n')
importFeature = clf.feature_importances_
# Std-dev of each feature's importance across the individual trees,
# used below as error bars on the plot.
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
# Feature indices sorted by decreasing importance.
indices = np.argsort(importFeature)[::-1]
# Print the feature ranking
print("Feature ranking:")
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importFeature[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
# BUG FIX: importances were computed on X, which has 'Class' dropped (61
# columns), but the original looked names up in colNames — the 62-column
# list that still contains 'Class' — so every feature after the dropped
# column was mislabeled by one position. Use X.columns instead.
feature_importances = pd.DataFrame(pd.Series(X.columns)[indices])
feature_importances['importance'] = np.sort(importFeature)[::-1]
feature_importances.columns = ['features', 'importance']
feature_importances
Accuracy: 0.911 Feature ranking:
| features | importance | |
|---|---|---|
| 1 | Amount | 0.104739 |
| 0 | Duration | 0.085347 |
| 4 | Age | 0.077534 |
| 12 | CheckingAccountStatus.gt.200 | 0.052844 |
| 3 | ResidenceDuration | 0.033434 |
| ... | ... | ... |
| 57 | Housing.ForFree | 0.001213 |
| 26 | Purpose.Vacation | 0.000863 |
| 22 | Purpose.Radio.Television | 0.000341 |
| 43 | Personal.Male.Married.Widowed | 0.000000 |
| 25 | Purpose.Education | 0.000000 |
61 rows × 2 columns